Pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.
Library documentation: http://pandas.pydata.org/
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# create a series
s = pd.Series([1,3,5,np.nan,6,8])
s
Out[2]:
In [3]:
# create a data frame
dates = pd.date_range('20130101',periods=6)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
df
Out[3]:
In [4]:
# another way to create a data frame
df2 = pd.DataFrame(
{ 'A' : 1.,
'B' : pd.Timestamp('20130102'),
'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
'D' : np.array([3] * 4,dtype='int32'),
'E' : 'foo' })
df2
Out[4]:
In [5]:
df2.dtypes
Out[5]:
In [6]:
df.head()
Out[6]:
In [7]:
df.index
Out[7]:
In [8]:
df.columns
Out[8]:
In [9]:
df.values
Out[9]:
In [10]:
# quick data summary
df.describe()
Out[10]:
In [11]:
df.T
Out[11]:
In [12]:
# axis 0 is index, axis 1 is columns
df.sort_index(axis=1, ascending=False)
Out[12]:
In [13]:
# can sort by values too
df.sort(columns='B')
Out[13]:
In [14]:
# select a column (yields a series)
df['A']
Out[14]:
In [15]:
# column names also attached to the object
df.A
Out[15]:
In [16]:
# slicing works
df[0:3]
Out[16]:
In [17]:
df['20130102':'20130104']
Out[17]:
In [18]:
# cross-section using a label
df.loc[dates[0]]
Out[18]:
In [19]:
# getting a scalar value
df.loc[dates[0], 'A']
Out[19]:
In [20]:
# select via position
df.iloc[3]
Out[20]:
In [21]:
df.iloc[3:5,0:2]
Out[21]:
In [22]:
# column slicing
df.iloc[:,1:3]
Out[22]:
In [23]:
# get a value by index
df.iloc[1,1]
Out[23]:
In [24]:
# boolean indexing
df[df.A > 0]
Out[24]:
In [25]:
df[df > 0]
Out[25]:
In [26]:
# filtering
df3 = df.copy()
df3['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df3[df3['E'].isin(['two', 'four'])]
Out[26]:
In [27]:
# setting examples
df.at[dates[0],'A'] = 0
df.iat[0,1] = 0
df.loc[:, 'D'] = np.array([5] * len(df))
df
Out[27]:
In [28]:
# dealing with missing data
df4 = df.reindex(index=dates[0:4],columns=list(df.columns) + ['E'])
df4.loc[dates[0]:dates[1],'E'] = 1
df4
Out[28]:
In [29]:
# drop rows with missing data
df4.dropna(how='any')
Out[29]:
In [30]:
# fill missing data
df4.fillna(value=5)
Out[30]:
In [31]:
# boolean mask for nan values
pd.isnull(df4)
Out[31]:
In [32]:
df.mean()
Out[32]:
In [33]:
# pivot the mean calculation
df.mean(1)
Out[33]:
In [34]:
# aligning objects with different dimensions
s = pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2)
df.sub(s,axis='index')
Out[34]:
In [35]:
# applying functions
df.apply(np.cumsum)
Out[35]:
In [36]:
df.apply(lambda x: x.max() - x.min())
Out[36]:
In [37]:
# simple count aggregation
s = pd.Series(np.random.randint(0,7,size=10))
s.value_counts()
Out[37]:
In [38]:
# concatenation
df = pd.DataFrame(np.random.randn(10, 4))
pieces = [df[:3], df[3:7], df[7:]]
pd.concat(pieces)
Out[38]:
In [39]:
# SQL-style join
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
pd.merge(left, right, on='key')
Out[39]:
In [40]:
# append
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
s = df.iloc[3]
df.append(s, ignore_index=True)
Out[40]:
In [41]:
df = pd.DataFrame(
{ 'A' : ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8) })
df
Out[41]:
In [42]:
# group by
df.groupby('A').sum()
Out[42]:
In [43]:
# group by multiple columns
df.groupby(['A','B']).sum()
Out[43]:
In [44]:
df = pd.DataFrame(
{ 'A' : ['one', 'one', 'two', 'three'] * 3,
'B' : ['A', 'B', 'C'] * 4,
'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
'D' : np.random.randn(12),
'E' : np.random.randn(12)} )
df
Out[44]:
In [45]:
# pivot table
pd.pivot_table(df, values='D', rows=['A', 'B'], columns=['C'])
Out[45]:
In [46]:
# time period resampling
rng = pd.date_range('1/1/2012', periods=100, freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.resample('5Min', how='sum')
Out[46]:
In [47]:
rng = pd.date_range('1/1/2012', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
Out[47]:
In [48]:
ps = ts.to_period()
ps.to_timestamp()
Out[48]:
In [49]:
# time series plot
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()
Out[49]:
In [50]:
# plot with a data frame
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
plt.figure(); df.plot(); plt.legend(loc='best')
Out[50]:
In [51]:
# write to a csv file
df.to_csv('foo.csv', index=False)
In [52]:
# read file back in
path = r'C:\Users\John\Documents\IPython Notebooks\foo.csv'
newDf = pd.read_csv(path)
newDf.head()
Out[52]:
In [53]:
# remove the file
import os
os.remove(path)
In [54]:
# can also do Excel
df.to_excel('foo.xlsx', sheet_name='Sheet1')
In [55]:
newDf2 = pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
newDf2.head()
Out[55]:
In [56]:
os.remove('foo.xlsx')